if (!require("pacman")) install.packages("pacman")
pacman::p_load(haven, tidyverse)

# Load and prepare individual data
ppfad <- haven::read_dta("data_not_upload/ppathl.dta") %>%
    dplyr::select(hid, pid, syear, sex, gebjahr, birthregion) %>%
    mutate(born_east = 1 * birthregion > 11) %>%
    filter(born_east == 1) %>%
    filter(between(gebjahr, 1966, 1975)) %>%
    filter(syear %in% c(2017:2019)) %>%
    mutate(sex = ifelse(sex > 0, sex, NA)) %>%
    mutate(female = 1 * (sex == 2)) %>%
    dplyr::select(hid, pid, syear, sex, gebjahr, born_east) %>%
    group_by(pid) %>%
    filter(syear == max(syear)) %>%
    ungroup()

# Load education data
pgen <- haven::read_dta("data_not_upload/pgen.dta") %>%
    dplyr::select(pgnation, pgisced11, pid, syear) %>%
    mutate(
        college_degree = 1 * (pgisced11 > 5),
        high_school_degree = 1 * (pgisced11 > 2)
    ) %>%
    dplyr::select(-pgnation, -pgisced11)

# Load household data
hpathl <- haven::read_dta("data_not_upload/hpathl.dta") %>%
    dplyr::select(hid, syear, sampreg) %>%
    mutate(
        sampreg = ifelse(sampreg < 0, NA, sampreg),
        east_now = 1 * (sampreg == 2)
    ) %>%
    dplyr::select(-sampreg)

# Merge datasets and prepare for analysis
soep <- ppfad %>%
    left_join(hpathl) %>%
    left_join(pgen) %>%
    mutate(
        moved_east_to_west = ifelse(born_east == 1 & east_now == 0, 1, 0),
        post = gebjahr > 1970,
        east_now_lab = ifelse(east_now == 1,
            "Women born in East,\nnow living in East",
            "Women born in East,\nnow living in West"
        )
    ) %>%
    filter(!is.na(east_now))

# Calculate aggregate statistics
soep_agg <- soep %>%
    group_by(gebjahr) %>%
    summarise(m = mean(east_now)) %>%
    ungroup() %>%
    mutate(post = gebjahr > 1970)

# Create plot
p1 <- soep %>%
    ggplot(aes(gebjahr, 100 * east_now, group = factor(post))) +
    geom_vline(xintercept = 1970.5) +
    geom_bar(data = soep_agg, aes(gebjahr, 100 * m), stat = "identity", width = 0.8) +
    theme_bw() +
    ylab("Share of female respondents\nwho reside in East Germany (%)") +
    xlab("Year of birth") +
    scale_x_continuous(breaks = 1965:2000)

p1
